Author: Francina Cabrera Fermin
CASA UCL 2021
This document provides instructions to extract informal settlements using spectral and texture features from 3-metre-resolution satellite scenes from PlanetScope. This project creates several Random Forest classification models based on the spectral and grey-level co-occurrence matrix (variance) values of satellite image pixels.
The data is stored in an online github repository: https://github.com/francicabrera/DI_Francina2021.git
This code follows the process from:
https://pages.cms.hu-berlin.de/EOL/gcg_eo/index.html
https://andrewmaclachlan.github.io/CASA0005repo/advanced-raster-analysis.html
## Install packages and load the required libraries
library(sf) # to read layers
library(tmap) # visualisation
library(googledrive) # to load data from google drive
library(here) # file referencing
library(raster) # to work with rasters
library(sp)
library(GGally) # to combine maps in a single plot
library(tidyverse) # data management
library(ggplot2) # to make maps and graphics
library(reshape2) # data management
library(caret) # create data partition
library(e1071) # to optimise the mtry parameter
library(randomForest) # to build the classification model
library(glcm) # to create GLCM model
library(RStoolbox) # to perform PCA
library(ggsn) # visualisation
library(cowplot) # to arrange inset maps
# Dominican Republic provinces (used for visualisation only).
# Read the shapefile and reproject to EPSG:32619 (WGS 84 / UTM zone 19N),
# the projected coordinate system for the Dominican Republic.
provinces <- here("data", "geo", "PROVCenso2010.shp") %>%
  st_read() %>%
  st_transform(32619)
## Reading layer `PROVCenso2010' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/geo/PROVCenso2010.shp' using driver `ESRI Shapefile'
## Simple feature collection with 32 features and 4 fields
## geometry type: MULTIPOLYGON
## dimension: XY
## bbox: xmin: 182215.8 ymin: 1933512 xmax: 571429.3 ymax: 2205216
## projected CRS: WGS 84 / UTM zone 19N
# Distrito Nacional's boundary (used for visualisation only).
# Read the shapefile and reproject to EPSG:32619 (WGS 84 / UTM zone 19N),
# the projected coordinate system for the Dominican Republic.
dn_boundary <- here("data", "geo", "DN_boundary.shp") %>%
  st_read() %>%
  st_transform(32619)
## Reading layer `DN_boundary' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/geo/DN_boundary.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 4 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: 394514.9 ymin: 2037241 xmax: 407670.6 ymax: 2051052
## projected CRS: WGS 84 / UTM zone 19N
# Distrito Nacional's circumscriptions (used for visualisation only).
# Read the shapefile and reproject to EPSG:32619 (WGS 84 / UTM zone 19N),
# the projected coordinate system for the Dominican Republic.
dn_circ <- here("data", "geo", "DN_circumscriptions.shp") %>%
  st_read() %>%
  st_transform(32619)
## Reading layer `DN_circumscriptions' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/geo/DN_circumscriptions.shp' using driver `ESRI Shapefile'
## Simple feature collection with 3 features and 7 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: 394514.9 ymin: 2037241 xmax: 407670.6 ymax: 2051052
## projected CRS: WGS 84 / UTM zone 19N
# Circumscription 3 — the study area for the classification.
# Read the shapefile and reproject to EPSG:32619 (WGS 84 / UTM zone 19N),
# the projected coordinate system for the Dominican Republic.
c3_boundary <- here("data", "geo", "Circ3.shp") %>%
  st_read() %>%
  st_transform(32619)
## Reading layer `Circ3' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/geo/Circ3.shp' using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 7 fields
## geometry type: POLYGON
## dimension: XY
## bbox: xmin: 403336.9 ymin: 2043038 xmax: 407670.6 ymax: 2047337
## projected CRS: WGS 84 / UTM zone 19N
# The imagery is downloaded from Google Drive.
# First scene:
# Temporary destination file that will receive the downloaded raster.
temp49 <- tempfile(fileext = ".tif")
# Fetch the scene from Google Drive by its file id.
dl49 <- drive_download(as_id("1ftQXEszRYiHfiCDkm1kJ3parha85569w"),
                       path = temp49,
                       overwrite = TRUE)
# Build a multi-band raster stack from the downloaded file.
raster49 <- stack(temp49)
# Confirm the CRS is WGS 84 / UTM zone 19N;
# if it is not, reproject with: %>% projectRaster(., crs=32619)
crs(raster49)
## CRS arguments:
## +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
# Second scene:
# Create a temporary file that will be substituted with the downloaded raster.
temp50 <- tempfile(fileext = ".tif")
# Download the raster with the file's id from Google Drive.
dl50 <- drive_download(as_id("1But-SQaZkdazRBNMUKXjocLyiD7GQkJL"),
path = temp50, overwrite = TRUE)
# Create the raster stack
raster50 <- stack(temp50)
# Check the coordinate system is WGS 84 / UTM zone 19N,
# if not use: %>% projectRaster(., crs=32619)
crs(raster50)
## CRS arguments:
## +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
# Quick look at the images by the different bands
# (plot() draws one panel per spectral band)
plot(raster49, main="PlanetScope Distrito Nacional Scene 1")
plot(raster50, main="PlanetScope Distrito Nacional Scene 2")
PlanetScope provides Usable Data Masks (UDM) to assess the data quality of their images. See more here: https://developers.planet.com/docs/data/udm-2/ Let’s verify the appearance of pixels covered with clouds or shadow.
# Create a temporary file that will be substituted with the downloaded UDM raster.
temp49_mask <- tempfile(fileext = ".tif")
# Download the UDM raster with the file's id from Google Drive.
dl49_mask <- drive_download(as_id("18r619B2o98q1JHxtkSyPFArvVKqKcpev"),
path = temp49_mask, overwrite = TRUE)
# Create the raster stack
raster49_mask <- stack(temp49_mask)
# Check the coordinate system is WGS 84 / UTM zone 19N,
# if not use: %>% projectRaster(., crs=32619)
crs(raster49_mask)
## CRS arguments:
## +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
Consider the meaning of 0 as “FALSE” or “NO”, 1 equals “TRUE” or “YES”. Or, 0: no cloud, 1: cloud; 0: no shadow, 1: shadow. See more here: https://pages.cms.hu-berlin.de/EOL/gcg_eo/02_data_quality.html
# Tabulate the cloud-band values; a single value of 0 means the scene is cloud-free.
freq(raster49_mask$cloud)
## value count
## [1,] 0 30305400
Repeat the same process with the UDM raster from the second scene.
# Create a temporary file that will be substituted with the downloaded raster.
temp50_mask <- tempfile(fileext = ".tif")
# Download the UDM raster with the file's id from Google Drive.
dl50_mask <- drive_download(as_id("18HnyLNzhswsV2S3b1EokS5OHXKsesZ99"),
path = temp50_mask, overwrite = TRUE)
# Create the raster stack
raster50_mask <- stack(temp50_mask)
# Check the coordinate system is WGS 84 / UTM zone 19N,
# if not use: %>% projectRaster(., crs=32619)
crs(raster50_mask)
## CRS arguments:
## +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
# Find the most frequent values using freq().
# A single value of 0 means the scene is cloud-free.
freq(raster50_mask$cloud)
## value count
## [1,] 0 30277741
As both rasters show no sign of clouds, the masks won't be used. Therefore, we will keep the original scenes.
Let’s create a mosaic out of the two scenes
See more here: http://www.qgistutorials.com/en/docs/3/raster_mosaicing_and_clipping.html
# Combine the two original scenes into a single mosaic with mosaic().
# Overlapping cells are averaged (fun = mean); tolerance permits a small
# mismatch in origin/resolution between the two rasters.
raster_mosaic <- mosaic(raster49, raster50,
                        fun = mean,
                        tolerance = 0.05,
                        filename = "raster_mosaic",
                        overwrite = TRUE)
# Quick look at the mosaic
plot(raster_mosaic, main = "Mosaic of PlanetScope scenes")
# Clip the mosaic to the Distrito Nacional boundary: crop() cuts to the
# polygon's bounding box, mask() then blanks cells outside the polygon.
# This raster is used for visualisation only.
mosaic_DN <- raster_mosaic %>%
  crop(dn_boundary, filename = "mosaicDN_crop", overwrite = TRUE) %>%
  mask(dn_boundary)
# Same clip for the Circumscription 3 (C3) boundary — the study area.
mosaic_C3 <- raster_mosaic %>%
  crop(c3_boundary, filename = "mosaicC3_crop", overwrite = TRUE) %>%
  mask(c3_boundary)
Information on the Circumscription 3 (C3)’s raster stack.
extent(mosaic_C3) # spatial extent (coordinates in metres, UTM zone 19N)
## class : Extent
## xmin : 403338
## xmax : 407670
## ymin : 2043039
## ymax : 2047338
ncell(mosaic_C3) # number of cells
## [1] 2069252
dim(mosaic_C3) # number of rows, columns, layers
## [1] 1433 1444 4
nlayers(mosaic_C3) # number of layers (4 spectral bands)
## [1] 4
res(mosaic_C3) # xres, yres: the 3 m PlanetScope resolution
## [1] 3 3
# Assign readable band names (PlanetScope band order: blue, green, red, NIR).
# Mosaic
names(raster_mosaic) <- c('blue', 'green', 'red', 'NIR')
# DN's mosaic
names(mosaic_DN) <- c('blue', 'green', 'red', 'NIR')
# C3's mosaic
names(mosaic_C3) <- c('blue', 'green', 'red', 'NIR')
# C3 composites.
# NOTE: plotRGB() returns invisible NULL, so assigning its result (as the
# original code did) stores nothing. Store the composite stack first, then
# plot it, so rC3_rgb / rC3_false actually hold the composites.
# true colour composite (R, G, B)
rC3_rgb <- stack(mosaic_C3$red, mosaic_C3$green, mosaic_C3$blue)
plotRGB(rC3_rgb, axes = TRUE, stretch = "lin")
# false colour composite (NIR, R, G) — highlights vegetation in red
rC3_false <- stack(mosaic_C3$NIR, mosaic_C3$red, mosaic_C3$green)
plotRGB(rC3_false, axes = TRUE, stretch = "lin")
The training data was collected digitally using a very high resolution (VHR) imagery as a reference: the Google maps terrain in QGIS.
The location of the informal settlements in the Distrito Nacional was obtained from: https://web.one.gob.do/publicaciones/2016/estudio-metodologia-para-la-identificacion-de-tugurios-en-el-distrito-nacional-censo-2010/
Following the classification for informal and formal settlements from: https://ieeexplore.ieee.org/document/6236225 and https://doi.org/10.1016/j.compenvurbsys.2011.11.001, the classification follows six classes:
Informal settlements Type I: Slums with an irregular shape, high density of buildings, and where paved roads could be visually identified. Roofs: grey, brown, red, and white. Same building with varying colours.
Informal settlements Type II: Slums with an irregular shape, high density of buildings, and undefined roads. Roofs: grey, brown, red, and white. Same building with varying colours.
Informal settlements Type III: Slums with an irregular shape, high density of buildings, undefined roads, and proximity to hazardous elements (rivers or ravines). Roofs: grey, brown, red, and white. Same building with varying colours. Hazardous arrangement.
Formal settlement: Settlements with regular shape, regular density of buildings and paved roads.
Non-settlements: Bare ground, grassland, forest, water, parking lots and sports courts.
Roads and asphalt: Paved roads.
# Read the training points digitised over Circumscription 3 and
# reproject them to EPSG:32619 to match the imagery.
trainC3 <- here("data", "training_data", "TrainingData_C3.shp") %>%
  st_read() %>%
  st_transform(32619)
## Reading layer `TrainingData_C3' from data source `/Users/francinacabrera/Documents/CASA/DI_Francina2021/Slum_detection/data/training_data/TrainingData_C3.shp' using driver `ESRI Shapefile'
## Simple feature collection with 11733 features and 4 fields
## geometry type: POINT
## dimension: XY
## bbox: xmin: 403348 ymin: 2043067 xmax: 407656.7 ymax: 2047330
## projected CRS: WGS 84 / UTM zone 19N
# Quick look of the feature
qtm(trainC3)
crs(trainC3) # Check crs matches the imagery (WGS 84 / UTM zone 19N)
## CRS arguments:
## +proj=utm +zone=19 +datum=WGS84 +units=m +no_defs
# Check how many data points there are per class
trainC3 %>% group_by(classID) %>% count()
## Simple feature collection with 6 features and 2 fields
## geometry type: MULTIPOINT
## dimension: XY
## bbox: xmin: 403348 ymin: 2043067 xmax: 407656.7 ymax: 2047330
## projected CRS: WGS 84 / UTM zone 19N
## # A tibble: 6 x 3
## classID n geometry
## * <dbl> <int> <MULTIPOINT [m]>
## 1 1 885 ((404114.5 2045755), (404119 2045677), (404126.3 2045593), (404…
## 2 2 3292 ((403493.6 2046463), (403495.7 2046399), (403498.3 2046478), (4…
## 3 3 627 ((403456.5 2046842), (403471.2 2046829), (403510.5 2046837), (4…
## 4 4 4444 ((403369.7 2046079), (403370.2 2046187), (403372.5 2046123), (4…
## 5 5 1071 ((403358.1 2046402), (403360.3 2046352), (403363.1 2046341), (4…
## 6 6 1414 ((403348 2046127), (403350.9 2046049), (403353.7 2045969), (403…
# Extract image values at training point locations
# (sp=T appends the band values to the points as a SpatialPointsDataFrame)
trainC3.sp <- raster::extract(mosaic_C3, trainC3, sp=T)
# Convert to data.frame for modelling
trainC3.df <- as.data.frame(trainC3.sp)
This will allow you to identify any outliers in the data.
# Reshape to long format: one row per point/band combination,
# keeping classID as the grouping variable.
spectra.df <- melt(trainC3.df, id.vars = 'classID',
                   measure.vars = c('blue', 'green', 'red', 'NIR'))
# Boxplots of the spectral bands per class. classID is still numeric at
# this point, so wrap it in factor(): otherwise ggplot maps it to a
# continuous colour scale and does not draw one box per class.
ggplot(spectra.df, aes(x = variable, y = value, color = factor(classID))) +
  geom_boxplot() +
  labs(color = "classID") + # keep the legend title as before
  theme_bw()
The function that we’ll use to train the classification model expects the dependent variable to be of type factor.
# Use as.factor() for conversion of the classID column:
# randomForest requires a factor response for classification.
trainC3.df$classID <- as.factor(trainC3.df$classID)
str(trainC3.df) # inspect variable types; classID is now a factor
## 'data.frame': 11733 obs. of 10 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ classID : Factor w/ 6 levels "1","2","3","4",..: 4 5 2 4 4 4 4 4 4 4 ...
## $ class_name: chr "formal_settlement" "non_settlement" "informal_settlement_type2" "formal_settlement" ...
## $ circ_num : num 3 3 3 3 3 3 3 3 3 3 ...
## $ blue : int 547 1192 761 1955 773 1489 1405 989 1383 541 ...
## $ green : int 632 1396 839 2161 858 1534 1472 1137 1507 618 ...
## $ red : int 523 1638 833 2198 810 1457 1482 1152 1369 529 ...
## $ NIR : int 965 2367 1188 2655 1318 1875 1787 1750 1699 1091 ...
## $ coords.x1 : num 404857 404829 403756 403969 403935 ...
## $ coords.x2 : num 2045945 2045801 2046283 2046086 2046240 ...
# Keep only useful predictors: the class label plus the four spectral
# bands. select() is namespaced explicitly because the raster package
# also exports a select(), and which one wins depends on library load
# order — dplyr:: makes the intent unambiguous.
trainC3_df <- dplyr::select(trainC3.df,
                            -c("id", "class_name", "circ_num",
                               "coords.x1", "coords.x2"))
# The classification model cannot deal with NoData (NA) values. Remove NAs from the data.frame.
sum(is.na(trainC3_df)) # count missing values before cleaning
## [1] 0
trainC3_df <- na.omit(trainC3_df)
sum(is.na(trainC3_df)) # confirm none remain
## [1] 0
Stratified random sampling helps us to validate the map using a sufficient number of samples for each of the six classes
See more about this here: https://stackoverflow.com/questions/20776887/stratified-splitting-the-data/30181396
# createDataPartition does a stratified random split of the data.
# Fix the RNG seed first so the train/test split is reproducible
# across runs (the per-class counts are the same either way, but the
# specific rows selected were not).
set.seed(42)
# Set a portion (20%) of the data aside for testing
sample <- createDataPartition(trainC3_df$classID, p = .80, list = FALSE)
train <- trainC3_df[sample,]
test <- trainC3_df[-sample,]
# Check how many data points there are per class in the training split
train %>% group_by(classID) %>% count()
## # A tibble: 6 x 2
## # Groups: classID [6]
## classID n
## <fct> <int>
## 1 1 708
## 2 2 2634
## 3 3 502
## 4 4 3556
## 5 5 857
## 6 6 1132
# ...and in the held-out test split (about a quarter of the training counts)
test %>% group_by(classID) %>% count()
## # A tibble: 6 x 2
## # Groups: classID [6]
## classID n
## <fct> <int>
## 1 1 177
## 2 2 658
## 3 3 125
## 4 4 888
## 5 5 214
## 6 6 282
# Check the dimension of the objects (rows x columns)
dim(train)
## [1] 9389 5
dim(test)
## [1] 2344 5
This section creates a classification model using the spectral bands of the mosaic.
This section follows the code from this source: https://pages.cms.hu-berlin.de/EOL/gcg_eo/05_machine_learning.html
This part of the process uses the package: “e1071”. We will optimise the mtry parameter.
Number of trees (ntree): it is unnecessary to tune ntree; instead it is recommended to set it to a large number and compare across multiple runs of the model.
Number of variables (mtry): set the number of k-folds that will run to find the optimal parameter.
Read more here: https://stats.stackexchange.com/questions/348245/do-we-have-to-tune-the-number-of-trees-in-a-random-forest
Other sources: https://www.youtube.com/watch?v=v5Bmz2eMd7M
# Define accuracy from 5-fold cross-validation as optimization measure
cv <- tune.control(cross = 5)
# Fix the RNG seed so the cross-validation folds and the fitted
# forests (and hence the selected model) are reproducible.
set.seed(42)
# Use tune.randomForest to find the optimal mtry. ntree is NOT tuned:
# it is held fixed at a large value (750), as recommended.
RF_modelC3.tune <- tune.randomForest(classID~.,
                                     data = train,
                                     ntree=750,
                                     mtry=c(2:10),
                                     importance = TRUE,
                                     tunecontrol = cv)
# Store the best model in a new object for further use
RF_modelC3 <- RF_modelC3.tune$best.model
print(RF_modelC3)
##
## Call:
## best.randomForest(x = classID ~ ., data = train, mtry = c(2:10), ntree = 750, importance = TRUE, tunecontrol = cv)
## Type of random forest: classification
## Number of trees: 750
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 54.17%
## Confusion matrix:
## 1 2 3 4 5 6 class.error
## 1 20 236 9 373 15 55 0.9717514
## 2 45 1002 89 1150 88 260 0.6195900
## 3 3 155 148 58 95 43 0.7051793
## 4 53 831 18 2369 49 236 0.3338020
## 5 3 178 60 121 398 97 0.5355893
## 6 25 335 29 331 46 366 0.6766784
# Check at which number of trees the model stabilises.
plot(RF_modelC3)
# Check which variables provide more information
# Calculate variable importance with importance()
RF1_imp <- importance(RF_modelC3, type = 1) # select type 1 for Mean decrease accuracy
typeof(RF1_imp)
## [1] "double"
# Convert to a data table, ordered by importance (most important first)
dt1 <- data.table::as.data.table(RF1_imp, keep.rownames = "var") %>%
.[order(-MeanDecreaseAccuracy)]
dt1
## var MeanDecreaseAccuracy
## 1: NIR 120.66887
## 2: blue 112.57748
## 3: red 71.93014
## 4: green 57.23620
# Horizontal bar chart of variable importance for the spectral-only model.
Graph3a <- ggplot(dt1,
                  aes(x = reorder(var, MeanDecreaseAccuracy),
                      y = MeanDecreaseAccuracy)) +
  geom_bar(stat = "identity", fill = '#f6e8c3', width = 0.5) +
  labs(title = "RFC\n(spectral)", x = "", y = "") +
  coord_flip() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
Graph3a
Perform a classification of the image stack using the predict() function.
# Run predict() to classify every pixel of the C3 mosaic with the
# fitted random forest (spectral bands only).
map <- predict(mosaic_C3, RF_modelC3)
# Plot raster
plot(map)
# Pixel counts per predicted class (NA = masked cells outside C3)
freq(map)
## value count
## [1,] 1 21461
## [2,] 2 454440
## [3,] 3 61075
## [4,] 4 583038
## [5,] 5 149557
## [6,] 6 193753
## [7,] NA 605928
# Uncomment to write classification to disk.
# writeRaster(map, filename="predicted_map", datatype="INT1S", overwrite=T)
Area Adjusted Accuracy assessment calculated as in http://reddcr.go.cr/sites/default/files/centro-de-documentacion/olofsson_et_al._2014_-_good_practices_for_estimating_area_and_assessing_accuracy_of_land_change.pdf
Code source: https://blogs.fu-berlin.de/reseda/area-adjusted-accuracies/
# Extract the predictions from the model, using the test dataset
predClass <- predict(RF_modelC3, test)
# Use as.factor() for conversion of the classID column in the test dataset
test_acc <- as.factor(test$classID)
# Build a contingency table of the counts at each combination of factor levels.
# Rows are the model's predictions, columns the reference (test) labels.
confmat <- table("pred" = predClass, "ref" = test_acc)
# get number of pixels per class and convert in km²
imgVal <- as.factor(getValues(map))
# Remove NA values from the classified dataset
imgVal <- na.omit(imgVal)
# Extract the number of classes
nclass <- length(unique(train$classID))
# Calculate the total area per class (pixel counts)
maparea <- sapply(1:nclass, function(x) sum(imgVal == x))
# Transform area in km2 (pixel count x cell area in m² / 10^6)
maparea <- maparea * res(map)[1] ^ 2 / 1000000
# total map area
A <- sum(maparea)
# proportion of area mapped as class i (W_i in Olofsson et al. 2014)
W_i <- maparea / A
# number of test samples per mapped (predicted) class:
# row totals of the confusion matrix
n_i <- rowSums(confmat)
# population error matrix: cell proportions weighted by mapped area
p <- W_i * confmat / n_i
# classes with no predicted samples yield NaN; treat as zero proportion
p[is.na(p)] <- 0
# area estimation (area-adjusted estimate per reference class)
p_area <- colSums(p) * A
# overall accuracy
OA <- sum(diag(p))
# producers accuracy
PA <- diag(p) / colSums(p)
# users accuracy
UA <- diag(p) / rowSums(p)
# gather results; OA is a single value, so its column is padded with NA
result <- matrix(c(p_area, PA * 100, UA * 100, c(OA * 100, rep(NA, nclass-1))), nrow = nclass)
result <- round(result, digits = 2)
rownames(result) <- levels(as.factor(train$classID))
colnames(result) <- c("km²","PA", "UA", "OA")
class(result) <- "table"
result
## km² PA UA OA
## 1 0.96 0.56 2.78 44.32
## 2 3.63 40.60 36.02
## 3 0.80 30.74 44.71
## 4 4.65 60.30 53.47
## 5 1.43 48.95 52.00
## 6 1.70 35.72 34.84
Now, we will add texture features to see if it improves the slum detection.
This code follows the methodology of: https://ieeexplore.ieee.org/document/7447704
Code Source: https://zia207.github.io/geospatial-r-github.io/texture-analysis.html#texture-analysis.
We’ll calculate the GLCM for a 5 x 5 kernel or window size. To calculate for different kernel (15x15, 21x21,…), change the “window” parameter.
# GLCM variance per spectral band, calculated over all four directions.
# FIX: the band names assigned earlier are c('blue','green','red','NIR')
# for layers 1-4, but the original code computed "texturesRed" from
# layer=1 (the blue band) and "texturesBlue" from layer=3 (the red band).
# The layer indices are corrected here so each texture raster really is
# computed from the band its name claims.
# Red band (layer 3)
texturesRed <- glcm(raster(mosaic_C3, layer=3),
window = c(5,5),
statistics = "variance",
shift=list(c(0,1), c(1,1), c(1,0), c(1,-1)))
plot(texturesRed)
# Green band (layer 2)
texturesGreen <- glcm(raster(mosaic_C3, layer=2),
window = c(5,5),
statistics = "variance",
shift=list(c(0,1), c(1,1), c(1,0), c(1,-1)))
plot(texturesGreen)
# Blue band (layer 1)
texturesBlue <- glcm(raster(mosaic_C3, layer=1),
window = c(5,5),
statistics = "variance",
shift=list(c(0,1), c(1,1), c(1,0), c(1,-1)))
plot(texturesBlue)
# NIR band (layer 4)
texturesNIR <- glcm(raster(mosaic_C3, layer=4),
window = c(5,5),
statistics = "variance",
shift=list(c(0,1), c(1,1), c(1,0), c(1,-1)))
plot(texturesNIR)
To decrease redundancy of texture bands and to determine the appropriate texture features, we will apply PCA to all texture images.
The function rasterPCA() will calculate the PCA of our raster stack and will return a raster brick with multiple layers of PCA scores.
Source code: https://zia207.github.io/geospatial-r-github.io/texture-analysis.html#texture-analysis
# Stack the glcm layers of all bands
mosaic_C3glcm <- stack(texturesRed, texturesGreen, texturesBlue, texturesNIR)
# Develop a PCA model
mosaic_C3glcm2 <- scale(mosaic_C3glcm) # scale the data (zero mean, unit variance)
# NOTE(review): NAs are replaced with 0 AFTER scaling, i.e. imputed with the
# band mean; window-edge pixels with no GLCM value therefore still feed the
# PCA — confirm this is the intended behaviour.
mosaic_C3glcm2[is.na(mosaic_C3glcm2)] <- 0 # define zero all missing values
mosaic_C3PCA <- rasterPCA(mosaic_C3glcm2, nComp=4)
summary(mosaic_C3PCA$model)
## Importance of components:
## Comp.1 Comp.2 Comp.3 Comp.4
## Standard deviation 1.494540 0.7157898 0.18889005 0.124274994
## Proportion of Variance 0.798551 0.1831718 0.01275574 0.005521476
## Cumulative Proportion 0.798551 0.9817228 0.99447852 1.000000000
# Extract first 2 PCs from the model
# Since the first two PCs account for most of the variability of these textures (see standard deviation), we will extract these 2 components.
# The values from these raster layers will be used as variables for our classification.
PCA1 <- mosaic_C3PCA$map$PC1
PCA2 <- mosaic_C3PCA$map$PC2
# Stack the two PC score layers into one raster.
PC_glcm <- stack(PCA1, PCA2)
# Append the texture PCs to the four spectral bands of the C3 mosaic.
mosaic_C3ST <- stack(mosaic_C3, PC_glcm)
# Sample the two texture-PC layers at the training-point locations
# (sp=T appends the values as a SpatialPointsDataFrame), then flatten.
trainC3.spGLCM <- raster::extract(PC_glcm, trainC3, sp=T)
trainC3.dfGLCM <- as.data.frame(trainC3.spGLCM)
# Join the spectral values and the texture PCs on the point id.
trainC3.dfST <- merge(trainC3.df,trainC3.dfGLCM, by = "id")
# merge() suffixed the duplicated columns; restore the class-label
# column name and convert it to a factor for randomForest.
names(trainC3.dfST)[2] <- 'classID'
trainC3.dfST$classID <- as.factor(trainC3.dfST$classID)
str(trainC3.dfST) # inspect the combined predictor table
## 'data.frame': 11733 obs. of 17 variables:
## $ id : num 1 2 3 4 5 6 7 8 9 10 ...
## $ classID : Factor w/ 6 levels "1","2","3","4",..: 4 5 2 4 4 4 4 4 4 4 ...
## $ class_name.x: chr "formal_settlement" "non_settlement" "informal_settlement_type2" "formal_settlement" ...
## $ circ_num.x : num 3 3 3 3 3 3 3 3 3 3 ...
## $ blue : int 547 1192 761 1955 773 1489 1405 989 1383 541 ...
## $ green : int 632 1396 839 2161 858 1534 1472 1137 1507 618 ...
## $ red : int 523 1638 833 2198 810 1457 1482 1152 1369 529 ...
## $ NIR : int 965 2367 1188 2655 1318 1875 1787 1750 1699 1091 ...
## $ coords.x1.x : num 404857 404829 403756 403969 403935 ...
## $ coords.x2.x : num 2045945 2045801 2046283 2046086 2046240 ...
## $ classID.y : num 4 5 2 4 4 4 4 4 4 4 ...
## $ class_name.y: chr "formal_settlement" "non_settlement" "informal_settlement_type2" "formal_settlement" ...
## $ circ_num.y : num 3 3 3 3 3 3 3 3 3 3 ...
## $ PC1 : num -0.817 2.798 -0.401 5.3 -0.318 ...
## $ PC2 : num 0.0432 -2.8057 0.2067 -2.6092 0.2352 ...
## $ coords.x1.y : num 404857 404829 403756 403969 403935 ...
## $ coords.x2.y : num 2045945 2045801 2046283 2046086 2046240 ...
# Keep only the class label and the predictors (spectral bands + texture
# PCs). select() is namespaced explicitly because the raster package also
# exports a select(); dplyr:: makes the intent unambiguous regardless of
# library load order.
trainC3_dfST <- dplyr::select(trainC3.dfST,
                              -c("id", "class_name.x","circ_num.x","coords.x1.x","coords.x2.x",
                                 "classID.y","class_name.y","circ_num.y","coords.x1.y","coords.x2.y"))
# The RF algorithm cannot deal with NoData (NA) values. Remove NAs from the data.frame.
sum(is.na(trainC3_dfST)) # check how many null values there are
## [1] 0
trainC3_dfST <- na.omit(trainC3_dfST)
sum(is.na(trainC3_dfST))
## [1] 0
# Fix the RNG seed so this split is reproducible.
set.seed(42)
# Set a portion (20%) of the data aside for testing (stratified split)
sampleST <- createDataPartition(trainC3_dfST$classID, p = .80, list = FALSE)
trainST <- trainC3_dfST[sampleST,]
testST <- trainC3_dfST[-sampleST,]
# Check how many data points there are per class in the training split
trainST %>% group_by(classID) %>% count()
## # A tibble: 6 x 2
## # Groups: classID [6]
## classID n
## <fct> <int>
## 1 1 708
## 2 2 2634
## 3 3 502
## 4 4 3556
## 5 5 857
## 6 6 1132
# ...and in the held-out test split
testST %>% group_by(classID) %>% count()
## # A tibble: 6 x 2
## # Groups: classID [6]
## classID n
## <fct> <int>
## 1 1 177
## 2 2 658
## 3 3 125
## 4 4 888
## 5 5 214
## 6 6 282
# Define accuracy from 5-fold cross-validation as optimization measure
cvST <- tune.control(cross = 5)
# Fix the RNG seed so the cross-validation folds and fitted forests
# (and hence the selected model) are reproducible.
set.seed(42)
# Use tune.randomForest to find the optimal mtry. ntree is NOT tuned:
# it is held fixed at a large value (750), as recommended.
RF_modelC3ST.tune <- tune.randomForest(classID~.,
                                       data = trainST,
                                       ntree=750,
                                       mtry=c(2:10),
                                       importance = TRUE,
                                       tunecontrol = cvST)
# Store the best model in a new object for further use
RF_modelC3ST <- RF_modelC3ST.tune$best.model
print(RF_modelC3ST)
##
## Call:
## best.randomForest(x = classID ~ ., data = trainST, mtry = c(2:10), ntree = 750, importance = TRUE, tunecontrol = cvST)
## Type of random forest: classification
## Number of trees: 750
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 51.87%
## Confusion matrix:
## 1 2 3 4 5 6 class.error
## 1 15 247 1 383 15 47 0.9788136
## 2 28 1047 77 1155 89 238 0.6025057
## 3 1 151 143 79 106 22 0.7151394
## 4 30 798 13 2454 52 209 0.3098988
## 5 1 130 73 143 452 58 0.4725788
## 6 13 287 12 361 51 408 0.6395760
# Calculate variable importance for the spectral + texture model with importance()
RF2_imp <- importance(RF_modelC3ST, type = 1) # select type 1 for Mean decrease accuracy
typeof(RF2_imp)
## [1] "double"
# Convert to a data table ordered by importance. The PC layers are
# principal components of ALL four texture bands, not of individual
# bands, so label them as PCs — the original "GLCM Red"/"GLCM Green"
# labels wrongly implied band-specific textures.
dt2 <- data.table::as.data.table(RF2_imp, keep.rownames = "var") %>%
.[order(-MeanDecreaseAccuracy)] %>%
.[var == "PC1", var := "GLCM PC1"] %>% # Rename GLCM variables
.[var == "PC2", var := "GLCM PC2"]
# Horizontal bar chart of variable importance for the spectral + texture
# model (5 x 5 GLCM window).
Graph3b <- ggplot(dt2,
                  aes(x = reorder(var, MeanDecreaseAccuracy),
                      y = MeanDecreaseAccuracy)) +
  geom_bar(stat = "identity", fill = '#f6e8c3', width = 0.5) +
  labs(title = "RFC\n(spectral + texture)\n5 x 5", x = "", y = "") +
  coord_flip() +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5))
Graph3b
Perform a classification of the image stack using the predict() function.
# Run predict() to classify every pixel of the spectral + texture stack
# with the second random forest model.
mapST <- predict(mosaic_C3ST, RF_modelC3ST)
# Plot raster
plot(mapST)
# Pixel counts per predicted class (NA = masked cells outside C3)
freq(mapST)
## value count
## [1,] 1 10615
## [2,] 2 447154
## [3,] 3 45705
## [4,] 4 605586
## [5,] 5 167633
## [6,] 6 186631
## [7,] NA 605928
# Uncomment to write classification to disk.
# writeRaster(mapST, filename="predicted_mapST", datatype="INT1S", overwrite=T)
# Extract the predictions from the model, using the test dataset
predClassST <- predict(RF_modelC3ST, testST)
# Use as.factor() for conversion of the classID column in the test dataset
test_accST <- as.factor(testST$classID)
# Build a contingency table of the counts at each combination of factor levels.
# Rows are the model's predictions, columns the reference (test) labels.
confmat_ST <- table("pred" = predClassST, "ref" = test_accST)
# get number of pixels per class and convert in km²
imgVal_ST <- as.factor(getValues(mapST))
# Remove NA values from the classified dataset
imgVal_ST <- na.omit(imgVal_ST)
# Extract the number of classes
nclass_ST <- length(unique(trainST$classID))
# Calculate the total area per class (pixel counts)
maparea_ST <- sapply(1:nclass_ST, function(x) sum(imgVal_ST == x))
# Transform area in km2 (pixel count x cell area in m² / 10^6)
maparea_ST <- maparea_ST * res(mapST)[1] ^ 2 / 1000000
# total map area
A_ST <- sum(maparea_ST)
# proportion of area mapped as class i (W_i in Olofsson et al. 2014)
W_i_ST <- maparea_ST / A_ST
# number of test samples per mapped (predicted) class:
# row totals of the confusion matrix
n_i_ST <- rowSums(confmat_ST)
# population error matrix: cell proportions weighted by mapped area
p_ST <- W_i_ST * confmat_ST / n_i_ST
# classes with no predicted samples yield NaN; treat as zero proportion
p_ST[is.na(p_ST)] <- 0
# area estimation (area-adjusted estimate per reference class)
p_area_ST <- colSums(p_ST) * A_ST
# overall accuracy
OA_ST <- sum(diag(p_ST))
# producers accuracy
PA_ST <- diag(p_ST) / colSums(p_ST)
# users accuracy
UA_ST <- diag(p_ST) / rowSums(p_ST)
# gather results; OA is a single value, so its column is padded with NA
result_ST <- matrix(c(p_area_ST, PA_ST * 100, UA_ST * 100, c(OA_ST * 100, rep(NA, nclass_ST-1))), nrow = nclass_ST)
result_ST <- round(result_ST, digits = 2)
rownames(result_ST) <- levels(as.factor(trainST$classID))
colnames(result_ST) <- c("km²", "PA", "UA", "OA")
class(result_ST) <- "table"
result_ST
## km² PA UA OA
## 1 0.94 1.61 15.79 47.75
## 2 3.68 43.19 39.50
## 3 0.75 25.59 46.91
## 4 4.58 64.81 54.40
## 5 1.49 59.87 59.02
## 6 1.74 36.61 37.84
Let’s plot some maps with the resulting classification
# Map 1 - training points over a true-colour composite of the C3 mosaic.
# ggRGB draws the imagery as a ggplot background layer; geom_sf overlays
# the training points coloured by class.
Map1 <- ggplot() +
ggRGB(mosaic_C3,
r = 3,
g = 2,
b = 1,
stretch = "lin",
ggLayer = TRUE) +
geom_sf(data = trainC3,
size = 0.4,
aes(color = factor(classID))) +
scale_color_manual(name = 'Classes',
values = c('#8c510a', '#d8b365', '#f6e8c3','#c7eae5','#5ab4ac','#01665e'),
labels = c('Informal Type I','Informal Type II', 'Informal Type III',
'Formal', 'No-settlement', 'Roads and asphalt')) +
theme(legend.title = element_text(size = 9),
legend.text = element_text(size = 6),
legend.key = element_rect(fill = NA),
legend.position= c(1.2, 0.62),
axis.ticks = element_line(colour = "grey70", size = 0.2),
panel.grid.major = element_line(colour = "grey70", size = 0.2),
panel.background = element_blank()) +
labs(x="",
y="") +
ggsn::scalebar(data = trainC3,
dist = 0.5,
dist_unit = "km",
transform = FALSE,
height = 0.01,
border.size = 0.5,
location = "bottomleft",
st.dist = 0.05) +
north(data=trainC3,
location= "topright",
symbol = 10)
Map1
# Uncomment to Save the map
# ggsave("Map1.png",
# plot = Map1,
# dpi = 600)
# Map 3 - Classification maps
# Map classification #1 (RFC spectral model)
# Convert the classified raster to a data frame of cell centres.
mapVal <- rasterToPoints(map)
mapdf <- data.frame(mapVal)
# NOTE(review): the coordinates are projected (UTM zone 19N, metres), so
# these columns are really easting/northing despite the chosen names.
colnames(mapdf) <- c('Longitude', 'Latitude', 'classID')
# Set the map's properties
Map3_1 <- ggplot() +
geom_raster(data = mapdf,
aes(y=Latitude,
x=Longitude,
fill = factor(classID))) +
coord_equal() +
scale_fill_manual(name = 'Classes',
values = c('#8c510a', '#d8b365', '#f6e8c3','#c7eae5','#5ab4ac','#01665e'),
labels = c('Informal Type I','Informal Type II', 'Informal Type III',
'Formal', 'No-settlement', 'Roads and asphalt')) +
theme(legend.title = element_text(size = 15),
legend.text = element_text(size = 12),
legend.key = element_rect(fill = NA),
axis.ticks = element_blank(),
axis.text = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank()) +
labs(title="Predicted Classification Map. RFC Spectral",
x="",
y="") +
ggsn::scalebar(data = trainC3,
dist = 0.5,
dist_unit = "km",
transform = FALSE,
height = 0.02,
border.size = 0.5,
location = "bottomright",
st.dist = 0.05) +
north(data=trainC3,
location= "topright",
symbol = 10)
Map3_1
# Map classification #2 (RFC spectral + texture model)
# Convert the classified raster to a data frame of cell centres.
mapValST <- rasterToPoints(mapST)
mapdfST <- data.frame(mapValST)
# NOTE(review): projected coordinates (UTM zone 19N, metres) — the
# column names are really easting/northing.
colnames(mapdfST) <- c('Longitude', 'Latitude', 'classID')
# Set the map's properties
Map3_2 <- ggplot() +
geom_raster(data = mapdfST,
aes(y=Latitude,
x=Longitude,
fill = factor(classID))) +
coord_equal() +
scale_fill_manual(name = 'Classes',
values = c('#8c510a', '#d8b365', '#f6e8c3','#c7eae5','#5ab4ac','#01665e'),
labels = c('Informal Type I','Informal Type II', 'Informal Type III',
'Formal', 'No-settlement', 'Roads and asphalt')) +
theme(legend.title = element_text(size = 15),
legend.text = element_text(size = 12),
legend.key = element_rect(fill = NA),
axis.ticks = element_blank(),
axis.text = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank()) +
labs(title="Predicted Classification Map. RFC Spectral + Texture",
x="",
y="") +
ggsn::scalebar(data = trainC3,
dist = 0.5,
dist_unit = "km",
transform = FALSE,
height = 0.01,
border.size = 0.5,
location = "bottomright",
st.dist = 0.05) +
north(data=trainC3,
location= "topright",
symbol = 10)
Map3_2
# Map 4 - Slum maps (by type)
# Map slums #1 (RFC spectral model)
# Keep only the three informal-settlement classes.
mapdf_slums <- filter(mapdf, classID == '1' | classID == '2' | classID == '3')
# Set the map's properties: slum pixels over a true-colour background.
Map4_1 <- ggplot() +
ggRGB(mosaic_C3,
r = 3,
g = 2,
b = 1,
stretch = "lin",
ggLayer = TRUE) +
geom_tile(data = mapdf_slums,
aes(y=Latitude,
x=Longitude,
fill = factor(classID))) +
coord_equal() +
scale_fill_manual(name = 'Classes',
values = c('#8c510a', '#d8b365', '#f6e8c3'),
labels = c('Informal Type I','Informal Type II', 'Informal Type III')) +
theme(legend.title = element_text(size = 15),
legend.text = element_text(size = 12),
legend.key = element_rect(fill = NA),
axis.ticks = element_blank(),
axis.text = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank()) +
labs(title="Predicted Informal Settlements Classification Map. RFC Spectral",
x="",
y="") +
ggsn::scalebar(data = trainC3,
dist = 0.5,
dist_unit = "km",
transform = FALSE,
height = 0.01,
border.size = 0.5,
location = "bottomright",
st.dist = 0.05) +
north(data=trainC3,
location= "topright",
symbol = 10)
Map4_1
# Map slums #2 (RFC spectral + texture model)
# Keep only the three informal-settlement classes.
mapdfST_slums <- filter(mapdfST, classID == '1' | classID == '2' | classID == '3')
# Set the map's properties: slum pixels over a true-colour background.
Map4_2 <- ggplot() +
ggRGB(mosaic_C3,
r = 3,
g = 2,
b = 1,
stretch = "lin",
ggLayer = TRUE) +
geom_tile(data = mapdfST_slums,
aes(y=Latitude,
x=Longitude,
fill = factor(classID))) +
coord_equal() +
scale_fill_manual(name = 'Classes',
values = c('#8c510a', '#d8b365', '#f6e8c3'),
labels = c('Informal Type I','Informal Type II', 'Informal Type III')) +
theme(legend.title = element_text(size = 15),
legend.text = element_text(size = 12),
legend.key = element_rect(fill = NA),
axis.ticks = element_blank(),
axis.text = element_blank(),
panel.grid.major = element_blank(),
panel.background = element_blank()) +
labs(title="Predicted Informal Settlements Classification Map. RFC Spectral + Texture",
x="",
y="") +
ggsn::scalebar(data = trainC3,
dist = 0.5,
dist_unit = "km",
transform = FALSE,
height = 0.01,
border.size = 0.5,
location = "bottomright",
st.dist = 0.05) +
north(data=trainC3,
location= "topright",
symbol = 10)
Map4_2